library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(dplyr)
library(ggplot2)
library(tidyr)
library(skimr)
library(here)
## here() starts at D:/VIRTUAL_INTERNSHIP/Quantium
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(readxl)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(stringr)
transaction_df <- clean_names(transaction)
skim_without_charts(transaction_df)
| Name | transaction_df |
| Number of rows | 264836 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| prod_name | 0 | 1 | 17 | 40 | 0 | 114 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| date | 0 | 1 | 43464.04 | 105.39 | 43282.0 | 43373.0 | 43464.0 | 43555.0 | 43646 |
| store_nbr | 0 | 1 | 135.08 | 76.78 | 1.0 | 70.0 | 130.0 | 203.0 | 272 |
| lylty_card_nbr | 0 | 1 | 135549.48 | 80579.98 | 1000.0 | 70021.0 | 130357.5 | 203094.2 | 2373711 |
| txn_id | 0 | 1 | 135158.31 | 78133.03 | 1.0 | 67601.5 | 135137.5 | 202701.2 | 2415841 |
| prod_nbr | 0 | 1 | 56.58 | 32.83 | 1.0 | 28.0 | 56.0 | 85.0 | 114 |
| prod_qty | 0 | 1 | 1.91 | 0.64 | 1.0 | 2.0 | 2.0 | 2.0 | 200 |
| tot_sales | 0 | 1 | 7.30 | 3.08 | 1.5 | 5.4 | 7.4 | 9.2 | 650 |
transaction_df$date <- as.Date(transaction_df$date, origin = "1899-12-30")
head(transaction_df)
product_words <- data.table(unlist(strsplit(unique(transaction_df$prod_name), " ")))
print(product_words)
## V1
## 1: Natural
## 2: Chip
## 3:
## 4:
## 5:
## ---
## 819: Doritos
## 820: Salsa
## 821: Mild
## 822:
## 823: 300g
##### Remove characters
words_data <- str_replace_all(product_words, "[^[:alnum:]]", " ")
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
words_data
## [1] "c Natural Chip Compny SeaSalt175g CCs Nacho Cheese 175g Smiths Crinkle Cut Chips Chicken 170g Smiths Chip Thinly S Cream Onion 175g Kettle Tortilla ChpsHny Jlpno Chili 150g Old El Paso Salsa Dip Tomato Mild 300g Smiths Crinkle Chips Salt Vinegar 330g Grain Waves Sweet Chilli 210g Doritos Corn Chip Mexican Jalapeno 150g Grain Waves Sour Cream Chives 210G Kettle Sensations Siracha Lime 150g Twisties Cheese 270g WW Crinkle Cut Chicken 175g Thins Chips Light Tangy 175g CCs Original 175g Burger Rings 220g NCC Sour Cream Garden Chives 175g Doritos Corn Chip Southern Chicken 150g Cheezels Cheese Box 125g Smiths Crinkle Original 330g Infzns Crn Crnchers Tangy Gcamole 110g Kettle Sea Salt And Vinegar 175g Smiths Chip Thinly Cut Original 175g Kettle Original 175g Red Rock Deli Thai Chilli Lime 150g Pringles Sthrn FriedChicken 134g Pringles Sweet Spcy BBQ 134g Red Rock Deli SR Salsa Mzzrlla 150g Thins Chips Originl saltd 175g Red Rock Deli Sp Salt Truffle 150G Smiths Thinly Swt Chli S Cream175G Kettle Chilli 175g Doritos Mexicana 170g Smiths Crinkle Cut French OnionDip 150g Natural ChipCo Hony Soy Chckn175g Dorito Corn Chp Supreme 380g Twisties Chicken270g Smiths Thinly Cut Roast Chicken 175g Smiths Crinkle Cut Tomato Salsa 150g Kettle Mozzarella Basil Pesto 175g Infuzions Thai SweetChili PotatoMix 110g Kettle Sensations Camembert Fig 150g Smith Crinkle Cut Mac N Cheese 150g Kettle Honey Soy Chicken 175g Thins Chips Seasonedchicken 175g Smiths Crinkle Cut Salt Vinegar 170g Infuzions BBQ Rib Prawn Crackers 110g GrnWves Plus Btroot Chilli Jam 180g Tyrrells Crisps Lightly Salted 165g Kettle Sweet Chilli And Sour Cream 175g Doritos Salsa Medium 300g Kettle 135g Swt Pot Sea Salt Pringles SourCream Onion 134g Doritos Corn Chips Original 170g Twisties Cheese Burger 250g Old El Paso Salsa Dip Chnky Tom Ht300g Cobs Popd Swt Chlli Sr Cream Chips 110g Woolworths Mild Salsa 300g Natural Chip Co Tmato Hrb Spce 175g Smiths Crinkle Cut Chips Original 170g Cobs Popd Sea Salt Chips 110g Smiths Crinkle Cut Chips Chs Onion170g French Fries Potato Chips 175g Old El Paso Salsa Dip Tomato Med 300g Doritos Corn Chips Cheese Supreme 170g Pringles Original Crisps 134g RRD Chilli Coconut 150g WW Original Corn Chips 200g Thins Potato Chips Hot Spicy 175g Cobs Popd Sour Crm Chives Chips 110g Smiths Crnkle Chip Orgnl Big Bag 380g Doritos Corn Chips Nacho Cheese 170g Kettle Sensations BBQ Maple 150g WW D Style Chip Sea Salt 200g Pringles Chicken Salt Crips 134g WW Original Stacked Chips 160g Smiths Chip Thinly CutSalt Vinegr175g Cheezels Cheese 330g Tostitos Lightly Salted 175g Thins Chips Salt Vinegar 175g Smiths Crinkle Cut Chips Barbecue 170g Cheetos Puffs 165g RRD Sweet Chilli Sour Cream 165g WW Crinkle Cut Original 175g Tostitos Splash Of Lime 175g Woolworths Medium Salsa 300g Kettle Tortilla ChpsBtroot Ricotta 150g CCs Tasty Cheese 175g Woolworths Cheese Rings 190g Tostitos Smoked Chipotle 175g Pringles Barbeque 134g WW Supreme Cheese Corn Chips 200g Pringles Mystery Flavour 134g Tyrrells Crisps Ched Chives 165g Snbts Whlgrn Crisps Cheddr Mstrd 90g Cheetos Chs Bacon Balls 190g Pringles Slt Vingar 134g Infuzions SourCream Herbs Veg Strws 110g Kettle Tortilla ChpsFeta Garlic 150g Infuzions Mango Chutny Papadums 70g RRD Steak Chimuchurri 150g RRD Honey Soy Chicken 165g Sunbites Whlegrn Crisps Frch Onin 90g RRD Salt Vinegar 165g Doritos Cheese Supreme 330g Smiths Crinkle Cut Snag Sauce 150g WW Sour Cream OnionStacked Chips 160g RRD Lime Pepper 165g Natural ChipCo Sea Salt Vinegr 175g Red Rock Deli Chikn Garlic Aioli 150g RRD SR Slow Rst Pork Belly 150g RRD Pc Sea Salt 165g Smith Crinkle Cut Bolognese 150g Doritos Salsa Mild 300g "
#### Remove digits
words_clean <- gsub('[[:digit:]]+', '',words_data)
words_clean
## [1] "c Natural Chip Compny SeaSaltg CCs Nacho Cheese g Smiths Crinkle Cut Chips Chicken g Smiths Chip Thinly S Cream Onion g Kettle Tortilla ChpsHny Jlpno Chili g Old El Paso Salsa Dip Tomato Mild g Smiths Crinkle Chips Salt Vinegar g Grain Waves Sweet Chilli g Doritos Corn Chip Mexican Jalapeno g Grain Waves Sour Cream Chives G Kettle Sensations Siracha Lime g Twisties Cheese g WW Crinkle Cut Chicken g Thins Chips Light Tangy g CCs Original g Burger Rings g NCC Sour Cream Garden Chives g Doritos Corn Chip Southern Chicken g Cheezels Cheese Box g Smiths Crinkle Original g Infzns Crn Crnchers Tangy Gcamole g Kettle Sea Salt And Vinegar g Smiths Chip Thinly Cut Original g Kettle Original g Red Rock Deli Thai Chilli Lime g Pringles Sthrn FriedChicken g Pringles Sweet Spcy BBQ g Red Rock Deli SR Salsa Mzzrlla g Thins Chips Originl saltd g Red Rock Deli Sp Salt Truffle G Smiths Thinly Swt Chli S CreamG Kettle Chilli g Doritos Mexicana g Smiths Crinkle Cut French OnionDip g Natural ChipCo Hony Soy Chckng Dorito Corn Chp Supreme g Twisties Chickeng Smiths Thinly Cut Roast Chicken g Smiths Crinkle Cut Tomato Salsa g Kettle Mozzarella Basil Pesto g Infuzions Thai SweetChili PotatoMix g Kettle Sensations Camembert Fig g Smith Crinkle Cut Mac N Cheese g Kettle Honey Soy Chicken g Thins Chips Seasonedchicken g Smiths Crinkle Cut Salt Vinegar g Infuzions BBQ Rib Prawn Crackers g GrnWves Plus Btroot Chilli Jam g Tyrrells Crisps Lightly Salted g Kettle Sweet Chilli And Sour Cream g Doritos Salsa Medium g Kettle g Swt Pot Sea Salt Pringles SourCream Onion g Doritos Corn Chips Original g Twisties Cheese Burger g Old El Paso Salsa Dip Chnky Tom Htg Cobs Popd Swt Chlli Sr Cream Chips g Woolworths Mild Salsa g Natural Chip Co Tmato Hrb Spce g Smiths Crinkle Cut Chips Original g Cobs Popd Sea Salt Chips g Smiths Crinkle Cut Chips Chs Oniong French Fries Potato Chips g Old El Paso Salsa Dip Tomato Med g Doritos Corn Chips Cheese Supreme g Pringles Original Crisps g RRD Chilli Coconut g WW Original Corn Chips g Thins Potato Chips Hot Spicy g Cobs Popd Sour Crm Chives Chips g Smiths Crnkle Chip Orgnl Big Bag g Doritos Corn Chips Nacho Cheese g Kettle Sensations BBQ Maple g WW D Style Chip Sea Salt g Pringles Chicken Salt Crips g WW Original Stacked Chips g Smiths Chip Thinly CutSalt Vinegrg Cheezels Cheese g Tostitos Lightly Salted g Thins Chips Salt Vinegar g Smiths Crinkle Cut Chips Barbecue g Cheetos Puffs g RRD Sweet Chilli Sour Cream g WW Crinkle Cut Original g Tostitos Splash Of Lime g Woolworths Medium Salsa g Kettle Tortilla ChpsBtroot Ricotta g CCs Tasty Cheese g Woolworths Cheese Rings g Tostitos Smoked Chipotle g Pringles Barbeque g WW Supreme Cheese Corn Chips g Pringles Mystery Flavour g Tyrrells Crisps Ched Chives g Snbts Whlgrn Crisps Cheddr Mstrd g Cheetos Chs Bacon Balls g Pringles Slt Vingar g Infuzions SourCream Herbs Veg Strws g Kettle Tortilla ChpsFeta Garlic g Infuzions Mango Chutny Papadums g RRD Steak Chimuchurri g RRD Honey Soy Chicken g Sunbites Whlegrn Crisps Frch Onin g RRD Salt Vinegar g Doritos Cheese Supreme g Smiths Crinkle Cut Snag Sauce g WW Sour Cream OnionStacked Chips g RRD Lime Pepper g Natural ChipCo Sea Salt Vinegr g Red Rock Deli Chikn Garlic Aioli g RRD SR Slow Rst Pork Belly g RRD Pc Sea Salt g Smith Crinkle Cut Bolognese g Doritos Salsa Mild g "
#### Make a table
words_product <- data.table(unlist(strsplit(unique(words_clean)," ")))
setnames(words_product, "words")
words_product
#### Remove blank, count, and sort
words_product %>%
mutate(words = na_if(words, "")) %>%
filter(!is.na(words)) %>%
group_by(words) %>%
count(words, sort= TRUE)
There are salsa products in the dataset
#### create salsa phrase
remove_salsa <- c('salsa', 'Salsa','SALSA')
#### remove rows than contain salsa on transaction dataset
clean_transaction <- transaction_df[ !grepl(paste(remove_salsa, collapse="|"), transaction_df$prod_name),]
summary(clean_transaction)
## date store_nbr lylty_card_nbr txn_id
## Min. :2018-07-01 Min. : 1.0 Min. : 1000 Min. : 1
## 1st Qu.:2018-09-30 1st Qu.: 70.0 1st Qu.: 70015 1st Qu.: 67569
## Median :2018-12-30 Median :130.0 Median : 130367 Median : 135183
## Mean :2018-12-30 Mean :135.1 Mean : 135531 Mean : 135131
## 3rd Qu.:2019-03-31 3rd Qu.:203.0 3rd Qu.: 203084 3rd Qu.: 202654
## Max. :2019-06-30 Max. :272.0 Max. :2373711 Max. :2415841
## prod_nbr prod_name prod_qty tot_sales
## Min. : 1.00 Length:246742 Min. : 1.000 Min. : 1.700
## 1st Qu.: 26.00 Class :character 1st Qu.: 2.000 1st Qu.: 5.800
## Median : 53.00 Mode :character Median : 2.000 Median : 7.400
## Mean : 56.35 Mean : 1.908 Mean : 7.321
## 3rd Qu.: 87.00 3rd Qu.: 2.000 3rd Qu.: 8.800
## Max. :114.00 Max. :200.000 Max. :650.000
There are no nulls in the columns but product quantity appears to have an outlier which we should investigate further. Let’s investigate further the case where 200 packets of chips are bought in one transaction.
clean_transaction %>% group_by(prod_name) %>% filter(prod_qty == 200)
There are two transactions where 200 packets of chips are bought in one transaction and both of these transactions were by the same customer
clean_transaction %>% filter (lylty_card_nbr==226000)
It looks like this customer (226000) has only had the two transactions over the year and is not an ordinary retail customer. The customer might be buying chips for commercial purposes instead. We’ll remove this loyalty card number from further analysis
new_transaction <- clean_transaction %>% filter(lylty_card_nbr!=226000)
Look at the number of transaction lines over time to see if there are any obvious data issues such as missing data #### Count the number of transactions by date
new_transaction %>% count(date)
There’s only 364 rows, meaning only 364 dates which indicates a missing date. Let’s create a sequence of dates from 1 Jul 2018 to 30 Jun 2019 and use this to create a chart of number of transactions over time to find the missing date
Create a column of dates that includes every day from 1 Jul 2018 to 30 Jun 2019, and join it onto the data to fill in the missing day
#### Sequence of date
all_dates <- data.frame(date = seq(as.Date("2018-07-01"), as.Date("2019-06-30"), by = "day"))
#### Join squence of date and new_transaction date
transaction_by_day <- all_dates %>%
left_join(new_transaction, by = "date") %>% count(date)
transaction_by_day
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))
ggplot(transaction_by_day, aes(x = date, y = n)) +
geom_line() +
labs(x = "Day", y = "Number of transactions", title = "Transactions over time") +
scale_x_date(breaks = "1 month") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
There is an increase in purchases in December and a break in late
December.
december_data <- subset(transaction_by_day, format(date, "%m") == "12")
ggplot(december_data, aes(x = date, y = n)) +
geom_line() +
labs(x = "Day", y = "Number of transactions", title = "Transactions over time") +
scale_x_date(breaks = "1 day") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
We can see that the increase in sales occurs in the lead-up to Christmas and that there are zero sales on Christmas day itself. This is due to shops being closed on Christmas day.
Now that we are satisfied that the data no longer has outliers, we can move on to creating other features such as brand of chips or pack size from PROD_NAME. We will start with pack size. #### Pack size We can work this out by taking the digits that are in prod_name
setDT(new_transaction)
new_transaction[, pack_size := parse_number(prod_name)]
new_transaction
transaction_pack_size <- new_transaction[, .N,pack_size][order(pack_size)]
transaction_pack_size
The largest size is 380g and the smallest size is 70g - seems sensible!
ggplot(transaction_pack_size, aes(x = pack_size)) +
geom_histogram(binwidth=50,color = 'white',fill ='blue') +
labs(x = "Pack Size", y = "Number of transactions", title = "Transactions by Pack Size")+
theme_minimal()
Create a column which contains the brand of the product, by extracting it from the product name
brand_transaction <- new_transaction %>%
mutate(brand = str_extract(prod_name, "^[^\\d]+"))
brand_transaction %>% group_by(brand)
Some of the brand names look like they are of the same brands - such as RED and RRD, which are both Red Rock Deli chips
clean_transaction <- brand_transaction %>%
mutate(brand = case_when(brand == "RED" ~ "RRD",TRUE ~ brand))
clean_transaction
purchase_df <- clean_names(purchase_behaviour)
skim_without_charts(purchase_df)
| Name | purchase_df |
| Number of rows | 72637 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| lifestage | 0 | 1 | 8 | 22 | 0 | 7 | 0 |
| premium_customer | 0 | 1 | 6 | 10 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| lylty_card_nbr | 0 | 1 | 136185.9 | 89892.93 | 1000 | 66202 | 134040 | 203375 | 2373711 |
all_data <- merge(clean_transaction, purchase_df, all.x = TRUE)
all_data
As the number of rows in all_data is the same as that of clean_transaction, we can be sure that no duplicates were created. This is because we created all_data by setting all.x = TRUE (in other words, a left join) which means take all the rows in clean_transaction and find rows with matching values in shared columns and then joining the details in these rows to the x or the first mentioned table.
skim_without_charts(all_data)
| Name | all_data |
| Number of rows | 246740 |
| Number of columns | 12 |
| Key | lylty_card_nbr |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| Date | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| prod_name | 0 | 1 | 17 | 40 | 0 | 105 | 0 |
| brand | 0 | 1 | 7 | 37 | 0 | 105 | 0 |
| lifestage | 0 | 1 | 8 | 22 | 0 | 7 | 0 |
| premium_customer | 0 | 1 | 6 | 10 | 0 | 3 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 2018-07-01 | 2019-06-30 | 2018-12-30 | 364 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| lylty_card_nbr | 0 | 1 | 135530.25 | 80715.20 | 1000.0 | 70015.00 | 130367.0 | 203083.2 | 2373711.0 |
| store_nbr | 0 | 1 | 135.05 | 76.79 | 1.0 | 70.00 | 130.0 | 203.0 | 272.0 |
| txn_id | 0 | 1 | 135130.36 | 78147.60 | 1.0 | 67568.75 | 135181.5 | 202652.2 | 2415841.0 |
| prod_nbr | 0 | 1 | 56.35 | 33.70 | 1.0 | 26.00 | 53.0 | 87.0 | 114.0 |
| prod_qty | 0 | 1 | 1.91 | 0.34 | 1.0 | 2.00 | 2.0 | 2.0 | 5.0 |
| tot_sales | 0 | 1 | 7.32 | 2.47 | 1.7 | 5.80 | 7.4 | 8.8 | 29.5 |
| pack_size | 0 | 1 | 175.58 | 59.43 | 70.0 | 150.00 | 170.0 | 175.0 | 380.0 |
There are no nulls. So all our customers in the transaction data has been accounted for in the customer dataset
write.csv(all_data, "all_data.csv",row.names = FALSE)